/
ConvertText.py
130 lines (119 loc) · 5.14 KB
/
ConvertText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#--------------------------------------Convert Attachment (DOC & PDF) Comments to Text---------------------------------#
#---------------------------------------------The GW Regulatory Studies Center-----------------------------------------#
#--------------------------------------------------Author: Zhoudan Xie-------------------------------------------------#
# Import packages
import sys
import os
import comtypes.client
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import fitz
import json
filePath="Retrieve Comments/Comment Attachments/" #! Specify the path of the folder where the comment attachments are saved
#-------------------------------------------Convert DOC files to PDF----------------------------------------------------
# Define a function to convert doc to pdf
def docToPdf(filePath,fileName):
wdFormatPDF = 17
in_file = os.path.abspath(filePath+fileName+'.doc')
out_file = os.path.abspath(filePath+fileName+'.pdf')
word = comtypes.client.CreateObject('Word.Application')
word.Visible = False
doc = word.Documents.Open(in_file)
doc.SaveAs(out_file, FileFormat=wdFormatPDF)
doc.Close()
word.Quit()
# Convert DOC comments to PDF
for file in os.listdir(filePath):
if file.endswith(".doc"):
fileName = str(file).split('.doc')[0]
if os.path.isfile(filePath + fileName + ".pdf"):
pass
else:
docToPdf(filePath,fileName)
#---------------------------------------------Convert PDF files to text-------------------------------------------------
# Define a function to convert scanned PDF to text
def convertScanPDF(file):
## Part 1 : Converting PDF to images
# Store all the pages of the PDF in a variable
pages = convert_from_path(file, 500)
# Counter to store images of each page of PDF to image
image_counter = 1
# Iterate through all the pages stored above
for page in pages:
# Declaring filename for each page of PDF as JPG
# For each page, filename will be:
# PDF page 1 -> page_1.jpg
# ....
# PDF page n -> page_n.jpg
filename = "page_" + str(image_counter) + ".jpg"
# Save the image of the page in system
page.save(filename, 'JPEG')
# Increment the counter to update filename
image_counter = image_counter + 1
##Part 2 - Recognizing text from the images using OCR
# Variable to get count of total number of pages
filelimit = image_counter - 1
text=''
# Iterate from 1 to total number of pages
for i in range(1, filelimit + 1):
# Set filename to recognize text from
# Again, these files will be:
# page_1.jpg
# page_2.jpg
# ....
# page_n.jpg
filename = "page_" + str(i) + ".jpg"
# Recognize the text as string in image using pytesserct
new_text = str(((pytesseract.image_to_string(Image.open(filename)))))
# The recognized text is stored in variable text.
# Any string processing may be applied on text
# Here, basic formatting has been done: In many PDFs, at line ending, if a word can't be written fully,
# a 'hyphen' is added. The rest of the word is written in the next line. Eg: This is a sample text this
# word here GeeksF-orGeeks is half on first line, remaining on next. To remove this, we replace every '-\n' to ''.
new_text = new_text.replace('-\n', '')
# Finally, write the processed text to the file.
text += new_text
return text
# Convert PDF comments to text
dic_pdfComments={}
notConverted=[]
for file in os.listdir(filePath):
if file.endswith(".pdf"):
doc = fitz.open(filePath+file)
fileName=str(file).split('.pdf')[0]
num_pages = doc.pageCount
count = 0
text = ""
while count < num_pages:
page = doc[count]
count += 1
text += page.getText('text')
if text != "":
text=text.replace('\n',' ')
dic_pdfComments.update({fileName: text})
else:
try:
text = convertScanPDF(filePath+file)
text = text.replace('\n', ' ')
dic_pdfComments.update({fileName: text})
except:
notConverted.append(file)
doc.close
print("The number of PDF files that have been converted to text is:", len(dic_pdfComments))
if len(notConverted)>0:
print("The following PDF files could not be converted:")
print(notConverted)
print("END")
# Print an example
print(dic_pdfComments.keys())
for key, value in dic_pdfComments.items():
if key=="EPA-HQ-OA-2017-0190-32609_1": #! Print the text of a specified document
print(key, ":", value)
#---------------------------------------------Export converted text-------------------------------------------------
# Export to JSON
## Output file will include text from all converted comments in one file
js_pdfComments=json.dumps(dic_pdfComments)
with open('Retrieve Comments/Attachment Comments Example.json', 'w', encoding='utf-8') as f: #! Specify the file to which you want to export the JSON
json.dump(js_pdfComments, f, ensure_ascii=False, indent=4)